capture log close
log using "$log/05_no_drop", text replace


* Note: "no drop" means persons with a training shorter than 6 months are
* NOT dropped. Persons with trainings longer than 4 years are still dropped.
* Otherwise this file reruns do-files 01_* to 04_*.

use beh_pers_gr spell_length persnr begorig endorig year beh_beruf_num betnr ///
    beh_ausbildung sex_id beh_staat_dummy age spell using $BeH_large, clear
set seed 666

* Restrict to spells with person group 102 or 141.
keep if inlist(beh_pers_gr, 102, 141)

* Drop enclosed and parallel spells: a spell is removed when the spell directly
* before it (same person, same year) starts no later and ends no earlier.
* Repeat until a pass removes nothing, because each drop creates new neighbours.
* NOTE(review): ties on begorig are ordered by ascending endorig, so a shorter
* spell sharing its start date with a longer one precedes it and is not
* removed by this check - confirm this is intended.
sort persnr begorig endorig
local n_dropped = 1
while `n_dropped' > 0 {
    drop if persnr == persnr[_n-1]       ///
        & year == year[_n-1]             ///
        & begorig >= begorig[_n-1]       ///
        & endorig <= endorig[_n-1]
    local n_dropped = r(N_drop)
}

*******************************************************************************
* Combine spells within year if same betnr and 2-digit-job

* 2-digit occupation: drop the last digit of beh_beruf_num. Negative results
* and the codes -5, 981 and 982 are pooled into the code -1.
gen beruf = floor(beh_beruf_num/10)
replace beruf = -1 if beruf < 0 | inlist(beh_beruf_num,-5, 981,982)

* Attach bula and ind of the establishment; unmatched using rows are discarded.
merge m:1 betnr using "$temp/wz_bula.dta", keepusing(bula ind)
drop if _merge == 2
drop _merge

* Number the spells of each person in chronological order so that n == 1
* marks the person's FIRST training spell.
* (Fix: the former "bysort persnr begorig: gen n = _n" restarted the counter at
* every distinct start date, so n == 1 held for one row per start date and the
* min() below returned the minimum over all spells instead of the value of
* the first spell - wrong for bula, ind and beruf.)
bysort persnr (begorig endorig): gen n = _n

* Spread the first spell's value of each source variable over all rows of the
* person. min() is only used to copy the single non-missing value.
local src  age year bula ind beruf
local dest train_start_age train_start_year train_start_bula train_start_ind first_train_beruf
forvalues k = 1/5 {
    local s : word `k' of `src'
    local d : word `k' of `dest'
    gen temp = `s' if n == 1
    by persnr: egen `d' = min(temp)
    drop temp
}
drop n

* Pass 1: within year, same establishment and same 2-digit occupation.
* A spell is tagged when the row directly before it belongs to the same person,
* establishment and year, the occupation matches (or is -1 on either side) and
* the spell starts at most 91 days after the previous spell ended.
sort persnr begorig betnr beruf

gen tag = 1 if ///
    persnr == persnr[_n-1] & ///
    betnr == betnr[_n-1] & ///
    (beruf == beruf[_n-1] | beruf==-1 | beruf[_n-1]==-1) ///
    & endorig[_n-1] + 91 >= begorig ///
    & year == year[_n-1]

* The tagged spell absorbs its predecessor: it inherits the earlier start date.
* replace works row by row, so chains of tagged spells carry the first start.
replace begorig = begorig[_n-1] if tag==1
replace spell_length = endorig - begorig + 1

* Fill unknown occupation (-1) / education (-7) from the absorbed spell.
* (Fix: the education guard used to test beruf[_n-1]!=-7, a copy-paste slip.)
replace beruf = beruf[_n-1] if beruf==-1 & beruf[_n-1]!=-1 & tag==1
replace beh_ausbildung = beh_ausbildung[_n-1] if beh_ausbildung==-7 & beh_ausbildung[_n-1]!=-7 & tag==1

* Remove every spell that was absorbed by its successor.
drop if tag[_n+1] == 1
drop tag

*******************************************************************************
* Combine spells within year if same 2-digit-job

* Tag a spell when the row directly before it belongs to the same person and
* year, the 2-digit occupation matches (or is -1 on either side) and the spell
* starts at most 91 days after the previous spell ended. The establishment may
* differ in this pass.
sort persnr begorig beruf

gen tag = 1 if ///
    persnr == persnr[_n-1] & ///
    (beruf == beruf[_n-1] | beruf==-1 | beruf[_n-1]==-1) ///
    & endorig[_n-1] + 91 >= begorig  ///
    & year == year[_n-1]

* The tagged spell absorbs its predecessor: it inherits the earlier start date.
replace begorig = begorig[_n-1] if tag==1
replace spell_length = endorig - begorig + 1

* Fill unknown occupation (-1) / education (-7) from the absorbed spell.
* (Fix: the education guard used to test beruf[_n-1]!=-7, a copy-paste slip.)
replace beruf = beruf[_n-1] if beruf==-1 & beruf[_n-1]!=-1 & tag==1
replace beh_ausbildung = beh_ausbildung[_n-1] if beh_ausbildung==-7 & beh_ausbildung[_n-1]!=-7 & tag==1

* Remove every spell that was absorbed by its successor.
drop if tag[_n+1] == 1
drop tag

*******************************************************************************
* Combine spells over year if same betnr and 2-digit-job

* Tag a spell when the row directly before it belongs to the same person and
* establishment, the 2-digit occupation matches (or is -1 on either side) and
* the spell starts at most 91 days after the previous spell ended. No
* same-year restriction in this pass.
sort persnr begorig betnr beruf

gen tag = 1 if ///
    persnr == persnr[_n-1] & ///
    betnr == betnr[_n-1] & ///
    (beruf == beruf[_n-1] | beruf==-1 | beruf[_n-1]==-1) ///
    & endorig[_n-1] + 91 >= begorig

* The tagged spell absorbs its predecessor: it inherits the earlier start date.
replace begorig = begorig[_n-1] if tag==1
replace spell_length = endorig - begorig + 1

* Fill unknown occupation (-1) / education (-7) from the absorbed spell.
* (Fix: the education guard used to test beruf[_n-1]!=-7, a copy-paste slip.)
replace beruf = beruf[_n-1] if beruf==-1 & beruf[_n-1]!=-1 & tag==1
replace beh_ausbildung = beh_ausbildung[_n-1] if beh_ausbildung==-7 & beh_ausbildung[_n-1]!=-7 & tag==1

* Remove every spell that was absorbed by its successor.
drop if tag[_n+1] == 1
drop tag

*******************************************************************************
* Combine spells over year if same 2-digit-job

* Tag a spell when the row directly before it belongs to the same person, the
* 2-digit occupation matches (or is -1 on either side) and the spell starts at
* most 91 days after the previous spell ended. Neither the establishment nor
* the year has to match in this pass.
sort persnr begorig beruf

gen tag = 1 if ///
    persnr == persnr[_n-1] & ///
    (beruf == beruf[_n-1] | beruf==-1 | beruf[_n-1]==-1) ///
    & endorig[_n-1] + 91 >= begorig

* The tagged spell absorbs its predecessor: it inherits the earlier start date.
replace begorig = begorig[_n-1] if tag==1
replace spell_length = endorig - begorig + 1

* Fill unknown occupation (-1) / education (-7) from the absorbed spell.
* (Fix: the education guard used to test beruf[_n-1]!=-7, a copy-paste slip.)
replace beruf = beruf[_n-1] if beruf==-1 & beruf[_n-1]!=-1 & tag==1
replace beh_ausbildung = beh_ausbildung[_n-1] if beh_ausbildung==-7 & beh_ausbildung[_n-1]!=-7 & tag==1

* Remove every spell that was absorbed by its successor.
drop if tag[_n+1] == 1
drop tag

* ---------------------------------------------------------------------------
* Keep only trainings whose combined length lies in [0, 1465] and, for persons
* with several remaining trainings, the one that ends first.
* ---------------------------------------------------------------------------
summarize spell_length, detail
capture drop n

* Person-level summaries of the combined spells.
bysort persnr: egen max_training = max(spell_length)
by persnr:     egen min_training = min(spell_length)
by persnr:     egen no_trainings = count(persnr)

by persnr: gen n = _n
tabulate no_trainings if n == 1

* Persons with a single training: the spell itself must be in range.
drop if no_trainings == 1 & !inrange(spell_length, 0, 1465)

* Diagnostics only: share of in-range spells, and of persons having at least
* one in-range spell.
gen keep = (no_trainings == 1 & inrange(spell_length, 0, 1465))
tabulate keep
replace keep = 1 if no_trainings > 1 & inrange(spell_length, 0, 1465)
tabulate keep

bysort persnr: egen min_keep = max(keep)
capture drop n
bysort persnr: gen n = _n
tabulate min_keep if n == 1
drop min_keep n

* Persons with several trainings: first those without any in-range spell,
* then every remaining out-of-range spell.
drop if no_trainings > 1 & (max_training < 0 | min_training > 1465)
drop if no_trainings > 1 & !inrange(spell_length, 0, 1465)

drop min_training max_training no_trainings keep

* Recount the trainings that survived, one tabulated row per person.
capture drop n
bysort persnr: egen no_trainings = count(persnr)
bysort persnr: gen n = _n
tabulate no_trainings if n == 1

* Of several trainings keep the one with the earliest end date.
bysort persnr: egen min_endorig = min(endorig)
drop if no_trainings > 1 & endorig != min_endorig

drop min_endorig

* Give the remaining training spell its final variable names.
rename (endorig   spell_length begorig     beruf     betnr      beh_ausbildung) ///
       (train_end train_lenght train_start train_occ train_firm train_edu)

* Labour-market entry is dated at the end of training.
clonevar day_entry = train_end
gen year_entry     = year(day_entry)
gen entry_german   = (beh_staat_dummy==1)
gen entry_gender   = (sex_id == 2)
rename age train_end_age

* Pool education codes pairwise: 2 into 1, 4 into 3, 6 into 5.
recode train_edu (2 = 1) (4 = 3) (6 = 5)
label define train_edu -7 "Missing" 0 "No degree" 1 "Volks-,Haupt-,Realschule" 3 "Abitur" 5 "Hochschule"
label values train_edu train_edu

keep persnr train_end train_lenght train_start train_firm train_occ        ///
     day_entry train_edu year_entry entry_german entry_gender no_trainings ///
     train_end_age first_train_beruf spell train_start_age                 ///
     train_start_year train_start_bula train_start_ind

* Does the occupation of the kept training equal first_train_beruf?
gen same_train_occ = (first_train_beruf==train_occ)
tabulate same_train_occ, missing

summarize train_lenght, detail
tabulate train_edu, missing

* NOTE(review): this file is later merged m:1 on persnr, which requires one
* row per person here - confirm that ties on the earliest end date cannot occur.
compress
save "$temp/train_end_no_drop", replace


*end of 06
********************************************************************************
********************************************************************************
*start of 07

* Build the spell-level file of all spells (1998-2018) of persons contained
* in the training file, restricted to spells ending after the training.
use $BeH_large if inrange(year,1998,2018), clear

set seed 20210505

* Keep only persons present in both files (matched rows).
merge m:1 persnr using "$temp/train_end_no_drop", keepusing(train_end train_end_age)
keep if _merge == 3
drop _merge

* capture: the renames are skipped silently when the source name is absent.
cap rename tentgelt wage
cap rename beh_ausbildung bild
cap rename beh_beruf_num beruf

* 2-digit occupation code.
replace beruf = floor(beruf/10)

compress
*Trained and received certificate
* trained = 1 if the person has any spell with person group 102 or 141.
gen x = (inlist(beh_pers_gr,102,141))
bysort persnr: egen trained = max(x)
drop x
compress

* certificate = 1 if the person has any spell with bild equal to 2 or 4.
gen x = (inlist(bild,2,4))
bysort persnr: egen certificate = max(x)
drop x
keep if trained==1
compress

* Discard all spells that end on or before the end of training.
drop if endorig <= train_end
drop train_end

* first = earliest start date among the person's non-training spells.
* The first egen is missing on training rows; mean() spreads it to all rows.
bysort persnr: egen x = min(begorig) if !inlist(beh_pers_gr,102,141)
bysort persnr: egen first = mean(x)
drop x

* Age at end of training must lie in [18, 28]; missing ages are dropped too
* because missing compares as larger than 28.
drop if train_end_age > 28
drop if train_end_age < 18
drop train_end_age

* Diagnostics: persons without any non-training spell, and spells that start
* before the first non-training spell.
tab year if missing(first)
tab beh_pers_gr if begorig < first

* (Removed: tag/tag2 were generated here and dropped again without being used.)
sort persnr begorig endorig

* Attach all variables of the training file.
merge m:1 persnr using "$temp/train_end_no_drop"
keep if _merge == 3
drop _merge

compress
save "$data\full_with_E_new_no_drop.dta", replace

*end of 07
********************************************************************************
********************************************************************************
*start of 08_01

* Person-years with exactly one spell (counting rows with non-missing betnr)
* need no cleaning: set them aside in a temporary file to be appended later.
rename wage tentgelt
bysort persnr year: egen n_spells = count(betnr)
summarize n_spells
keep if n_spells == 1
drop n_spells
save "$temp/singeltons", replace

********************************************************************************
********************************************************************************

* Reload the full file and continue with all other person-years.
use "$data\full_with_E_new_no_drop.dta", clear
rename wage tentgelt

bysort persnr year: egen n_spells = count(betnr)
summarize n_spells
drop if n_spells == 1
drop n_spells

*** Clean data (delete parallel spell, keep main employment spell)
********************************************************************************
** Clean data within establishment / year

* 1) Drop exact duplicates (same person, establishment, wage, dates and pt)
duplicates drop persnr betnr tentgelt begorig endorig pt, force

* 2) Delete (short) enclosed spells; for spells with the same start but
*    different length keep the longer spell
* 2.1) fast path: applies when the longest spell within person/year/
*      establishment is unique
bysort persnr year betnr: egen max_spell = max(spell_length)
gen tag = (spell_length == max_spell)
drop max_spell
bysort persnr year betnr: egen count_tag = total(tag)
replace tag = 0 if count_tag > 1      // several longest spells: leave to 2.2
drop count_tag

* Spread the dates of the unique longest spell over its group; both stay
* missing when the group has no unique longest spell.
gen longest_beg = begorig if tag == 1
gen longest_end = endorig if tag == 1
drop tag
bysort persnr year betnr: egen beg1 = max(longest_beg)
bysort persnr year betnr: egen end1 = max(longest_end)
drop longest_beg longest_end

* Drop spells lying inside the longest spell and strictly shorter than it.
* (Fix: "&" binds tighter than "|", so the missing-value guard used to cover
* only the second clause; the parentheses now apply it to both.)
drop if !missing(beg1) & !missing(end1) & ///
    ((begorig >= beg1 & endorig < end1) | (begorig > beg1 & endorig <= end1))
drop beg1 end1

* 2.2) using loop, slower, but works with multiple longest spells within year:
*      drop a spell enclosed by the row directly before it and repeat until a
*      pass removes nothing.
sort persnr betnr begorig endorig
local n = 1
while `n' > 0 {
    drop if persnr == persnr[_n-1] & betnr == betnr[_n-1] & ///
        ((begorig >= begorig[_n-1] & endorig < endorig[_n-1]) | ///
        (begorig > begorig[_n-1] & endorig <= endorig[_n-1]))
    local n = r(N_drop)
}

* 3) Overlapping spells: when a spell starts inside the preceding spell of the
*    same person and establishment but ends after it, move its start to the
*    day after the preceding spell ends. Re-sort and repeat until the tabulation
*    of the flag shows a single value, i.e. nothing is flagged any more.
local n_levels = 2
while `n_levels' > 1 {
    sort persnr betnr begorig endorig
    gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1]        ///
        & endorig > endorig[_n-1]                                     ///
        & begorig >= begorig[_n-1] & begorig <= endorig[_n-1])
    tabulate tag, missing
    local n_levels = r(r)
    replace begorig = endorig[_n-1] + 1 if tag == 1
    drop tag
}

* Start dates changed, so rebuild the spell length.
drop spell_length
gen spell_length = endorig - begorig + 1

* 4) Parallel spells = rows sharing person, establishment, start and end date
local spell_id persnr betnr begorig endorig

* 4a) same pt status, different wage: keep the row(s) with the highest wage
duplicates tag `spell_id' pt, generate(tag)
tabulate tag, missing
bysort `spell_id' pt: egen double maxwage = max(tentgelt) if tag > 0
drop if tag > 0 & tentgelt < maxwage
drop maxwage tag

* 4b) any pt status, different wage: keep the row(s) with the highest wage
duplicates tag `spell_id', generate(tag)
tabulate tag, missing
bysort `spell_id': egen double maxwage = max(tentgelt) if tag > 0
drop if tag > 0 & tentgelt < maxwage
drop maxwage tag

* 4c) same wage: a group mean of pt below 1 means the group also holds a row
*     with pt below 1, so the pt == 1 row(s) are dropped
duplicates tag `spell_id' tentgelt, generate(tag)
tabulate tag, missing
bysort `spell_id' tentgelt: egen m_pt = mean(pt) if tag > 0
drop if tag > 0 & pt == 1 & m_pt < 1
drop m_pt tag

* 4d) whatever is still duplicated on spell and wage: keep one row
duplicates drop persnr betnr tentgelt begorig endorig, force

* 5) combine consecutive spells within ft and pt
* A spell is "consecutive" when it starts the day after the previous row ends,
* for the same person, establishment, start year and pt status. Chains of such
* spells are collapsed into their last row, which receives the start date of
* the chain and a duration-weighted average wage.
sort persnr betnr begorig endorig pt
* tag = 1: this row directly continues the row before it.
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & ///
year(begorig) == year(begorig[_n-1]) & begorig - 1 == endorig[_n-1] & ///
pt == pt[_n-1])
* tag2 = 1 for every member of a chain (the head is the row whose successor is tagged).
gen tag2 = (tag[_n+1] == 1)
replace tag2 = 1 if tag == 1
* Running count: tag2 becomes the position within the chain (head = 1, next = 2, ...).
* replace works row by row, so tag2[_n-1] is already the updated position.
replace tag2 = tag2 + tag2[_n-1] if tag2 != 0 & tag != 0 
su tag2
* Number of merge rounds needed = length of the longest chain minus one.
local max = (r(max) - 1)

* Round i merges the row at chain position i into the row at position i + 1.
forv i = 1/`max' {
* Durations are rebuilt every round, so a row that absorbed its predecessors
* carries their cumulated duration as weight.
gen duration = endorig - begorig + 1  
replace begorig = begorig[_n-1] if tag == 1 & tag2 == (`i' + 1)
* duration still holds the lengths from before the start date was moved.
replace tentgelt = (tentgelt * duration + tentgelt[_n-1] * duration[_n-1])/(duration + duration[_n-1]) if tag == 1 & tag2 == (`i' + 1) 
* The row at position i has been absorbed and is removed.
drop if tag2 == `i' & tag2[_n+1] == (`i' + 1)
drop duration
}
* Averaged wages are rounded to two decimals.
replace tentgelt = round(tentgelt,.01)
* Start dates changed, so rebuild the spell length.
drop spell_length
gen spell_length = (endorig - begorig + 1)

drop tag tag2

********************************************************************************
** Assert that there are no parallel or overlapping spells within establishments

*Parallel spells within establishments
* No two rows may share person, establishment, start and end date.
duplicates tag persnr betnr begorig endorig, g(tag)
assert tag == 0
drop tag

*Overlapping spells within establishments
* The check compares each row with the row directly before it, so the data
* must be in chronological order within person and establishment. Sort
* explicitly instead of relying on the order left behind by earlier commands
* (start dates were modified after the last sort).
sort persnr betnr begorig endorig
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & begorig <= endorig[_n-1] & (begorig != begorig[_n-1] | endorig != endorig[_n-1]))
assert tag == 0
drop tag

* Add back the person-years with a single spell that were set aside before
* cleaning, then remove the temporary file.
append using "$temp/singeltons"
erase "$temp/singeltons.dta" // INCLUDE LINE AGAIN

count

save "$data\full_with_E_new_clean_no_drop.dta", replace


*end of 08_01
********************************************************************************
********************************************************************************
*start of 08_02


cap drop probies
rename entry_german german
rename entry_gender gender
rename tentgelt wage
rename train_lenght train_length

* Person-level variables that must not vary across a person's rows.
* Plain macro assignment (no "=") copies the list verbatim; the former
* local x = "..." evaluated a string expression, which older Stata versions
* truncate, and this list is already close to that limit.
local stablevars trained certificate train_end train_end_age first            ///
    train_firm gender german train_occ day_entry year_entry no_trainings      ///
    train_edu train_length train_start train_start_age train_start_year       ///
    train_start_bula train_start_ind first_train_beruf

* Make sure the time-invariant variables are constant within person: every row
* must equal the person mean. A variable that is partly missing within a
* person fails the check; one that is missing on all rows of a person passes.
foreach var of local stablevars {
    bysort persnr: egen double check = mean(`var')
    assert check == `var'
    drop check
}

* Spell-level earnings (length x wage); the _ft versions are filled only for
* spells with pt == 0.
gen spell_earnings    = spell_length * wage
gen spell_earnings_ft = spell_earnings if pt == 0
gen spell_length_ft   = spell_length   if pt == 0


* Occupation per person/establishment/year: take the occupation of the
* longest spell(s); if several longest spells disagree, use their mode, and
* the highest code when the mode is not unique (maxmode).
rename beruf beruf_org
count if missing(beruf_org)
bysort persnr betnr year: egen max_spell = max(spell_length)
gen beruf = cond(spell_length == max_spell, beruf_org, .)
drop max_spell
count if missing(beruf)

bysort persnr betnr year: egen beruf_mode = mode(beruf), maxmode
count if missing(beruf_mode)
drop beruf
rename beruf_mode beruf

keep `stablevars' pt beruf same_train_occ spell_length spell_length_ft ///
    spell_earnings spell_earnings_ft persnr betnr year
compress

* One row per person/establishment/year: means of the person-level variables,
* pt, beruf and same_train_occ; sums of lengths and earnings.
fcollapse (mean) `stablevars' pt beruf same_train_occ (sum) spell_length spell_length_ft spell_earnings spell_earnings_ft, by(persnr betnr year) pool(15)

summarize spell_length, detail
summarize spell_length_ft, detail

* Average wage per person/establishment/year = summed earnings / summed length.
gen wage_ft = spell_earnings_ft / spell_length_ft
gen wage    = spell_earnings    / spell_length
rename (spell_earnings_ft spell_earnings) (est_earnings_ft est_earnings)

* pt is now the mean of the spell-level pt within person/establishment/year.
summarize pt, detail
count if pt == 0 // pure full-time
count if pt > 0 & pt < 1 // full- and part-time 
count if pt == 1 // pure part-time

* Annual earnings per person, summed over establishments.
bysort persnr year: egen double aearnings_ft = sum(est_earnings_ft)
by persnr year:     egen double aearnings    = sum(est_earnings)

*************************
* DOMINANT Establishment EARNINGS *
*************************
* An establishment is dominant in a person-year when its earnings equal the
* person-year maximum and are not zero.

by persnr year: egen double top_ft = max(est_earnings_ft)
gen dom_job_ft = (est_earnings_ft!=0 & est_earnings_ft == top_ft)
drop top_ft

by persnr year: egen double top_all = max(est_earnings)
gen dom_job    = (est_earnings!=0 & est_earnings == top_all)
drop top_all

tabulate dom_job_ft, missing
tabulate dom_job, missing

* Training establishment: look up ind and bula under the training firm's id.
* betnr is parked under another name while train_firm takes its place as key.
rename betnr      firm_current
rename train_firm betnr
merge m:1 betnr using "$temp/wz_bula", keepusing(ind bula)
drop if _merge==2
drop _merge
rename (ind bula) (train_ind region_entry)
rename betnr        train_firm
rename firm_current betnr

* Current establishment: same lookup keyed on the current establishment.
merge m:1 betnr using "$temp/wz_bula", keepusing(ind bula)
rename (ind bula) (current_ind region_current)
drop if _merge==2
drop _merge

* Reduce both industry codes by one digit; the summaries go to the log.
summarize current_ind
gen ind_2dig = floor(current_ind/10)
summarize ind_2dig

gen train_ind_2dig = floor(train_ind/10)
summarize train_ind
summarize train_ind_2dig

* The shortened codes take over the original variable names.
drop current_ind train_ind
rename train_ind_2dig train_ind
rename ind_2dig       current_ind

* Characteristics of the current establishment in the current year
merge m:1 betnr year using "$temp/BHP_vars_year.dta"
drop if _merge==2
drop _merge
rename (az_ges       az_azubi         te_med           te_imp_med)            ///
       (empl_current trainees_current med_wage_current med_imp_wage_current)

* Characteristics of the training establishment in the entry year: park the
* current keys, let train_firm / year_entry act as betnr / year for the merge
* and undo the swap afterwards.
* NOTE(review): variables of the using file other than the four renamed above
* already exist in memory after the first merge and are therefore not replaced
* by this second merge - confirm the using file holds nothing else of interest.
rename (betnr year)            (betnr_current year_current)
rename (train_firm year_entry) (betnr year)
merge m:1 betnr year using "$temp/BHP_vars_year.dta"
drop if _merge==2
drop _merge
rename (az_ges     az_azubi       te_med         te_imp_med)          ///
       (empl_train trainees_train med_wage_train med_imp_wage_train)
rename (betnr year)                 (train_firm year_entry)
rename (betnr_current year_current) (betnr year)
		
*Working at training firm?
gen work_at_train = (betnr==train_firm)

*Work in training occ?
* The code -1 is treated as missing on both sides.
replace beruf     = . if beruf == -1
replace train_occ = . if train_occ == -1
* The indicator is only defined when BOTH occupations are known.
* (Fix: a missing train_occ used to yield 0 instead of missing, because only
* a missing beruf was reset.)
gen work_in_train_occ = (beruf==train_occ) if !missing(beruf, train_occ)
tab work_in_train_occ, mis

*Work in training industry?
* Same rule: missing unless both industries are known.
* (Fix: a missing train_ind used to yield 0 instead of missing.)
gen work_in_train_ind = (current_ind==train_ind) if !missing(current_ind, train_ind)
tab work_in_train_ind, mis

compress

*******************************
* MERGE IN UNEMPLOYMENT RATES *
*******************************

	
* Attach the variables of U_internal by entry year and entry region; rows that
* exist only in the using file are discarded.
merge m:1 year_entry region_entry using "$temp\U_internal.dta"
drop if _merge==2
drop _merge

* Exclude the training occupations 98 and 55.
drop if inlist(train_occ, 98, 55)

gen month_entry = month(day_entry)

* Logs of (value + 1).
foreach v of varlist wage med_imp_wage_train {
    gen log_`v' = ln(`v' + 1)
}

* Years elapsed since the entry year.
gen potential_experience = year - year_entry

compress
save "$temp/analysis_pt_new_no_drop", replace

********************************************************************************

* Person-level 2% sample: one integer draw from 1..100 per person, made on the
* person's first row and spread to all of the person's rows; persons with a
* draw of 1 or 2 are kept.
set seed 666
bysort persnr: gen row_in_person = _n
gen draw = floor(uniform()*100)+1 if row_in_person == 1
bysort persnr: egen rand = mean(draw)
drop draw row_in_person
summarize rand
keep if rand <= 2 

compress
save "$temp/analysis_pt_2p_new_no_drop", replace

clear
capture log close

